Final Project: Modelling¶

Importing Packages / Libraries / Modules¶

In [15]:
# Basic Data & Visualization Tools

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set(font_scale = 2)

# Data Import Tools

import sqlite3

# NLP Tools

import spacy

import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

nltk.download('wordnet')
nltk.download('punkt')

# Model 1

from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier

# Model 2

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, roc_curve, precision_recall_curve, classification_report, confusion
from sklearn.model_selection import learning_curve

# Model 3
    
    # No warnings about setting value on copy of slice
pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', 60)

    # Set default font size
plt.rcParams['font.size'] = 24

from IPython.core.pylabtools import figsize

    # Imputing missing values
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

    # Machine Learning Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

from sklearn import tree

    # LIME for explaining predictions
import lime 
import lime.lime_tabular
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/markjones/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/markjones/nltk_data...
[nltk_data]   Package punkt is already up-to-date!

Model 1: TextBlob Naive Bayes Classifier¶

Import the Data we have pre-processed / prepared¶

In [2]:
conn = sqlite3.connect('db/amazon_reviews.db')
In [55]:
# Load the dataset into a DataFrame
df = pd.read_sql('SELECT * FROM Processed_Full_Dataset', conn)

df.head(3)
Out[55]:
index Id ProductId ProfileName HelpfulnessNumerator HelpfulnessDenominator Score Summary dataset clean_review Amazon_Tag helpfulness_pct lemma nouns propn adjectives verbs npav no_tokens Quality_Sentiment_Score Price_Sentiment_Score Overall_Score PN_Labels
0 0 1 B001E4KFG0 delmartian 1 1 5 Good Quality Dog Food Test i have bought several of the vitality canned d... positive 1.0 I have buy several of the vitality can dog foo... vitality dog food product quality product stew... several good finicky most buy can find look process smell appreciate vitality dog food product quality product stew... 52.0 0.35 N/A 0.35 positive
1 1 2 B00813GRG4 dll pa 0 0 1 Not as Advertised Test product arrived labeled as jumbo salted peanut... negative 0.0 product arrive label as jumbo salt peanut ... ... product peanut peanut unsalted error vendor pr... jumbo small sized sure jumbo arrive label salt intend represent product peanut peanut unsalted error vendor pr... 37.0 N/A N/A 0.075 positive
2 2 3 B000LQOCH0 Natalia Corres "Natalia Corres" 1 1 4 "Delight" says it all Test this is a confection that has been around a fe... positive 1.0 this be a confection that have be around a few... confection century light citrus nut case filbe... gelatin heaven c.s . lewis few pillowy tiny powdered tiny flavorful yummy... cut coat recommend seduce sell confection century light citrus nut case filbe... 114.0 N/A N/A -0.2 negative
In [27]:
len(df)
Out[27]:
568454

The dataset is too large, we will sample it to get a smaller section to train the Naive Bayes Classifier on

In [5]:
df_nbc = df.sample(frac=0.02, random_state=786)
In [6]:
len(df_nbc)
Out[6]:
11369

We want to try the Naive Bayes using the Amazon tag (i.e. whether the score was >=3 it was positive)

In [7]:
x = df_nbc['clean_review']
y = df_nbc['Amazon_Tag']
In [8]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=.5)
In [9]:
#create tuples

train = [x for x in zip(x_train,y_train)]
test = [x for x in zip(x_test, y_test)]

Train the model using a Naive Bayes classifier¶

In [10]:
#Train the classifier on the cleaned corpus against the labelled positive/negative Tag

classifier = NaiveBayesClassifier(train)

Test the TextBlob Naive Bayes Classifier Model¶

(this will be used as our benchmark)

In [11]:
#test the accuracy

print(classifier.accuracy(test))
0.8504837291116975

Model 2: Logistic Regression using the TD-IDF Vectors¶

In [12]:
# TF-IDF vectorization of the text data
tfidf_vectorizer = TfidfVectorizer()
x_train_tfidf = tfidf_vectorizer.fit_transform(x_train)
x_test_tfidf = tfidf_vectorizer.transform(x_test)

# Example: Logistic Regression
model = LogisticRegression()
model.fit(x_train_tfidf, y_train)

# Predictions on the actual test set
y_pred = model.predict(x_test_tfidf)

unique_labels = np.unique(y_test)
print("Unique labels in y_test:", unique_labels)
Unique labels in y_test: ['negative' 'positive']
In [13]:
#print("Actual Labels (y_test):", np.unique(y_test))
#print("Predicted Labels (y_pred):", np.unique(y_pred))


print(len(y_test))
print(len(y_pred))
5685
5685
In [14]:
# Evaluate precision, recall, and F1-score
precision = precision_score(y_test, y_pred, pos_label='positive')


recall = recall_score(y_test, y_pred, pos_label='positive')
f1 = f1_score(y_test, y_pred, pos_label='positive')

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

# Plot ROC Curve
fpr, tpr, thresholds = roc_curve(y_test, model.predict_proba(x_test_tfidf)[:, 1], pos_label='positive')
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label="ROC Curve")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.legend()
plt.show()

# Plot Precision-Recall Curve
precision, recall, _ = precision_recall_curve(y_test, model.predict_proba(x_test_tfidf)[:, 1], pos_label='positive')
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, label="Precision-Recall Curve")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve")
plt.legend()
plt.show()

# Plot Learning Curve
train_sizes, train_scores, test_scores = learning_curve(
    model, x_train_tfidf, y_train, cv=5, scoring='accuracy', train_sizes=np.linspace(0.1, 1.0, 10)
)

plt.figure(figsize=(8, 6))
plt.plot(train_sizes, np.mean(train_scores, axis=1), label="Training Accuracy")
plt.plot(train_sizes, np.mean(test_scores, axis=1), label="Testing Accuracy")
plt.xlabel("Training Examples")
plt.ylabel("Accuracy")
plt.title("Learning Curve")
plt.legend()
plt.show()
Precision: 0.8661
Recall: 0.9961
F1-score: 0.9266
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [19]:
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

    negative       0.87      0.14      0.25       866
    positive       0.87      1.00      0.93      4819

    accuracy                           0.87      5685
   macro avg       0.87      0.57      0.59      5685
weighted avg       0.87      0.87      0.82      5685

The above Recall for Negative shows that it is not performing well. This is likely due to the dataset being unbalanced.

Model 3: Gradient Boosting Regressor¶

Following Week 6 Tutorial: Model Interpretation¶

https://towardsdatascience.com/a-complete-machine-learning-walk-through-in-python-part-three-388834e8804b

This tutorial uses a Gradient Boosting Regressor, as such, we will use some different features here.

In [44]:
df.dtypes
Out[44]:
index                        int64
Id                           int64
ProductId                   object
ProfileName                 object
HelpfulnessNumerator         int64
HelpfulnessDenominator       int64
Score                        int64
Summary                     object
dataset                     object
clean_review                object
Amazon_Tag                  object
helpfulness_pct            float64
lemma                       object
nouns                       object
propn                       object
adjectives                  object
verbs                       object
npav                        object
no_tokens                  float64
Quality_Sentiment_Score     object
Price_Sentiment_Score       object
Overall_Score               object
PN_Labels                   object
dtype: object
In [45]:
print(len(df))
568454
In [56]:
df['Quality_Sentiment_Score'] = (
    pd.to_numeric(df['Quality_Sentiment_Score'],
                  errors='coerce')
      .fillna(0)
    )
In [57]:
df['Price_Sentiment_Score'] = (
    pd.to_numeric(df['Price_Sentiment_Score'],
                  errors='coerce')
      .fillna(0)
    )
In [58]:
df['Quality_Sentiment_Score'] = (
    pd.to_numeric(df['Overall_Score'],
                  errors='coerce')
      .fillna(0)
    )
In [59]:
df.head(3)
Out[59]:
index Id ProductId ProfileName HelpfulnessNumerator HelpfulnessDenominator Score Summary dataset clean_review Amazon_Tag helpfulness_pct lemma nouns propn adjectives verbs npav no_tokens Quality_Sentiment_Score Price_Sentiment_Score Overall_Score PN_Labels
0 0 1 B001E4KFG0 delmartian 1 1 5 Good Quality Dog Food Test i have bought several of the vitality canned d... positive 1.0 I have buy several of the vitality can dog foo... vitality dog food product quality product stew... several good finicky most buy can find look process smell appreciate vitality dog food product quality product stew... 52.0 0.350 0.0 0.35 positive
1 1 2 B00813GRG4 dll pa 0 0 1 Not as Advertised Test product arrived labeled as jumbo salted peanut... negative 0.0 product arrive label as jumbo salt peanut ... ... product peanut peanut unsalted error vendor pr... jumbo small sized sure jumbo arrive label salt intend represent product peanut peanut unsalted error vendor pr... 37.0 0.075 0.0 0.075 positive
2 2 3 B000LQOCH0 Natalia Corres "Natalia Corres" 1 1 4 "Delight" says it all Test this is a confection that has been around a fe... positive 1.0 this be a confection that have be around a few... confection century light citrus nut case filbe... gelatin heaven c.s . lewis few pillowy tiny powdered tiny flavorful yummy... cut coat recommend seduce sell confection century light citrus nut case filbe... 114.0 -0.200 0.0 -0.2 negative
In [60]:
features_keep = 'helpfulness_pct', 'no_tokens', 'Quality_Sentiment_Score', 'Price_Sentiment_Score', 'Overall_Score'
label_keep = 'Amazon_Tag'
In [61]:
x = df.drop(columns=[col for col in df if col not in features_keep])
x.head(3)
Out[61]:
helpfulness_pct no_tokens Quality_Sentiment_Score Price_Sentiment_Score Overall_Score
0 1.0 52.0 0.350 0.0 0.35
1 0.0 37.0 0.075 0.0 0.075
2 1.0 114.0 -0.200 0.0 -0.2
In [62]:
y = df.drop(columns=[col for col in df if col not in label_keep])
y.head(3)
Out[62]:
Amazon_Tag
0 positive
1 negative
2 positive
In [71]:
y['Amazon_Tag'].replace({'positive': 1, 'negative': 0}, inplace=True)
In [72]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=.5)
In [73]:
# Create an imputer object with a median filling strategy
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
In [74]:
# Train on the training features
imputer.fit(x_train)
Out[74]:
SimpleImputer(strategy='median')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SimpleImputer(strategy='median')
In [75]:
# Transform both training data and testing data
x_train = imputer.transform(x_train)
x_test = imputer.transform(x_test)

# Sklearn wants the labels as one-dimensional vectors
y_train = np.array(y_train).reshape((-1,))
y_test = np.array(y_test).reshape((-1,))
In [76]:
# Function to calculate mean absolute error
def mae(y_true, y_pred):
    return np.mean(abs(y_true - y_pred))
In [85]:
model = GradientBoostingRegressor(loss='squared_error', max_depth=5, max_features=None,
                                  min_samples_leaf=6, min_samples_split=6, 
                                  n_estimators=800, random_state=42)

model.fit(x_train, y_train)
Out[85]:
GradientBoostingRegressor(max_depth=5, min_samples_leaf=6, min_samples_split=6,
                          n_estimators=800, random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GradientBoostingRegressor(max_depth=5, min_samples_leaf=6, min_samples_split=6,
                          n_estimators=800, random_state=42)
In [86]:
#  Make predictions on the test set
model_pred = model.predict(x_test)

print('Final Model Performance on the test set: MAE = %0.4f' % mae(y_test, model_pred))
Final Model Performance on the test set: MAE = 0.2200
In [87]:
# Extract the feature importances into a dataframe
feature_results = pd.DataFrame({'feature': list(x.columns), 
                                'importance': model.feature_importances_})

# Show the top 10 most important
feature_results = feature_results.sort_values('importance', ascending = False).reset_index(drop=True)

feature_results.head(10)
Out[87]:
feature importance
0 Overall_Score 0.427683
1 Quality_Sentiment_Score 0.242368
2 no_tokens 0.198608
3 helpfulness_pct 0.113719
4 Price_Sentiment_Score 0.017623
In [88]:
figsize(12, 10)
plt.style.use('fivethirtyeight')

# Plot the 10 most important features in a horizontal bar chart
feature_results.loc[:9, :].plot(x = 'feature', y = 'importance', 
                                 edgecolor = 'k',
                                 kind='barh', color = 'blue');
plt.xlabel('Relative Importance', size = 20); plt.ylabel('')
plt.title('Feature Importances from Random Forest', size = 30);
No description has been provided for this image
In [90]:
# Extract the names of the most important features
most_important_features = feature_results['feature'][:10]

# Find the index that corresponds to each feature name
indices = [list(x.columns).index(x_train) for x_train in most_important_features]

# Keep only the most important features
x_reduced = x_train[:, indices]
x_test_reduced = x_test[:, indices]

print('Most important training features shape: ', x_reduced.shape)
print('Most important testing  features shape: ', x_test_reduced.shape)
Most important training features shape:  (284227, 5)
Most important testing  features shape:  (284227, 5)
In [91]:
lr = LinearRegression()

# Fit on full set of features
lr.fit(x_train, y_train)
lr_full_pred = lr.predict(x_test)

# Fit on reduced set of features
lr.fit(x_reduced, y_train)
lr_reduced_pred = lr.predict(x_test_reduced)

# Display results
print('Linear Regression Full Results: MAE =    %0.4f.' % mae(y_test, lr_full_pred))
print('Linear Regression Reduced Results: MAE = %0.4f.' % mae(y_test, lr_reduced_pred))
Linear Regression Full Results: MAE =    0.2351.
Linear Regression Reduced Results: MAE = 0.2351.
In [92]:
# Create the model with the same hyperparamters
model_reduced = GradientBoostingRegressor(loss='squared_error', max_depth=5, max_features=None,
                                  min_samples_leaf=6, min_samples_split=6, 
                                  n_estimators=800, random_state=42)

# Fit and test on the reduced set of features
model_reduced.fit(x_reduced, y_train)
model_reduced_pred = model_reduced.predict(x_test_reduced)

print('Gradient Boosted Reduced Results: MAE = %0.4f' % mae(y_test, model_reduced_pred))
Gradient Boosted Reduced Results: MAE = 0.2200

The above shows that the Gradient Boosted Reduced Results did not change from the first attempt at Gradient Boosting.

Model 4: Locally Interpretable Model-agnostic Explanations (LIME)¶

Following Week 6 Tutorial: Model Interpretation

https://towardsdatascience.com/a-complete-machine-learning-walk-through-in-python-part-three-388834e8804b

In [94]:
# Find the residuals
residuals = abs(model_reduced_pred - y_test)
    
# Exact the worst and best prediction
wrong = x_test_reduced[np.argmax(residuals), :]
right = x_test_reduced[np.argmin(residuals), :]
In [95]:
# Create a lime explainer object
explainer = lime.lime_tabular.LimeTabularExplainer(training_data = x_reduced, 
                                                   mode = 'regression',
                                                   training_labels = y_train,
                                                   feature_names = list(most_important_features))

Here is an example from the LIME model that was predicted incorrectly.¶

We can see the reasons why this was predicted wrong

In [96]:
# Display the predicted and true value for the wrong instance
print('Prediction: %0.4f' % model_reduced.predict(wrong.reshape(1, -1)))
print('Actual Value: %0.4f' % y_test[np.argmax(residuals)])

# Explanation for wrong prediction
wrong_exp = explainer.explain_instance(data_row = wrong, 
                                       predict_fn = model_reduced.predict)

# Plot the prediction explaination
wrong_exp.as_pyplot_figure();
plt.title('Explanation of Prediction', size = 28);
plt.xlabel('Effect on Prediction', size = 22);
/var/folders/jz/06jb_cbd77v_7jmcq3q91s9m0000gn/T/ipykernel_96306/2438247814.py:2: DeprecationWarning: Conversion of an array with ndim > 0 to a scalar is deprecated, and will error in future. Ensure you extract a single element from your array before performing this operation. (Deprecated NumPy 1.25.)
  print('Prediction: %0.4f' % model_reduced.predict(wrong.reshape(1, -1)))
Prediction: 1.0847
Actual Value: 0.0000
No description has been provided for this image
In [97]:
wrong_exp.show_in_notebook(show_predicted_value=False)

Here is an example from the LIME model that was predicted Correct.¶

We can see the reasons why this was predicted correct

In [98]:
# Display the predicted and true value for the wrong instance
print('Prediction: %0.4f' % model_reduced.predict(right.reshape(1, -1)))
print('Actual Value: %0.4f' % y_test[np.argmin(residuals)])

# Explanation for wrong prediction
right_exp = explainer.explain_instance(right, model_reduced.predict, num_features=10)
right_exp.as_pyplot_figure();
plt.title('Explanation of Prediction', size = 28);
plt.xlabel('Effect on Prediction', size = 22);
/var/folders/jz/06jb_cbd77v_7jmcq3q91s9m0000gn/T/ipykernel_96306/2277720504.py:2: DeprecationWarning: Conversion of an array with ndim > 0 to a scalar is deprecated, and will error in future. Ensure you extract a single element from your array before performing this operation. (Deprecated NumPy 1.25.)
  print('Prediction: %0.4f' % model_reduced.predict(right.reshape(1, -1)))
Prediction: 1.0000
Actual Value: 1.0000
No description has been provided for this image
In [100]:
right_exp.show_in_notebook(show_predicted_value=False)
In [101]:
# Extract a single tree
single_tree = model_reduced.estimators_[105][0]

tree.export_graphviz(single_tree, out_file = 'images/amazon_tree.dot',
                     rounded = True, 
                     feature_names = most_important_features,
                     filled = True)

single_tree
Out[101]:
DecisionTreeRegressor(criterion='friedman_mse', max_depth=5, min_samples_leaf=6,
                      min_samples_split=6,
                      random_state=RandomState(MT19937) at 0x5D3C66940)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeRegressor(criterion='friedman_mse', max_depth=5, min_samples_leaf=6,
                      min_samples_split=6,
                      random_state=RandomState(MT19937) at 0x5D3C66940)
In [105]:
!dot -Tpng images/amazon_tree.dot -o images/amazon_tree.png
In [106]:
from IPython.display import Image
Image(filename='images/amazon_tree.png') 
Out[106]:
No description has been provided for this image
In [107]:
tree.export_graphviz(single_tree, out_file = 'images/amazon_tree_small.dot',
                     rounded = True, feature_names = most_important_features,
                     filled = True, max_depth = 3)
In [108]:
!dot -Tpng images/amazon_tree_small.dot -o images/amazon_tree_small.png
In [109]:
Image(filename='images/amazon_tree_small.png') 
Out[109]:
No description has been provided for this image
In [ ]: